# coding=utf-8
import zipfile
import os

from pyhanlp import *
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
	"""
	Return the test-data directory ``$HANLP_DATA_PATH/test``, creating it if needed.

	The root directory comes from the pyhanlp configuration (``HANLP_DATA_PATH``).

	:return: absolute path of the test-data directory.
	"""
	data_path = os.path.join(HANLP_DATA_PATH, 'test')
	# makedirs + exist_ok fixes two defects of the old isdir/mkdir pair:
	# it also creates missing parent directories, and it is not racy
	# between the existence check and the creation.
	os.makedirs(data_path, exist_ok=True)
	return data_path


## Ensure the MSR corpus is present locally; download it automatically if missing.
def ensure_data(data_name, data_url):
	"""
	Return the local path for *data_name*, fetching it from *data_url* on first use.

	A ``.zip`` URL is downloaded to a temporary archive, extracted next to it,
	and the archive is removed afterwards.

	:param data_name: name of the dataset directory/file under the test-data root.
	:param data_url: source URL; may point at a zip archive.
	:return: path of the (possibly extracted) dataset.
	"""
	root_path = test_data_path()
	dest_path = os.path.join(root_path, data_name)
	if os.path.exists(dest_path):
		return dest_path

	is_archive = data_url.endswith('.zip')
	download_target = dest_path + '.zip' if is_archive else dest_path
	download(data_url, download_target)
	if is_archive:
		with zipfile.ZipFile(download_target, "r") as archive:
			archive.extractall(root_path)
		remove_file(download_target)
	return dest_path


# Download (once) the SIGHAN 2005 bakeoff corpora and locate the MSR training file.
sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
msr_train = os.path.join(sighan05, 'training', 'msr_training.utf8')

## ===============================================
## Chinese word segmentation starts here

# JVM proxies for HanLP's HMM classes (loaded through pyhanlp's JPype bridge).
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')
SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')
HMMSegmenter = JClass('com.hankcs.hanlp.model.hmm.HMMSegmenter')


def train(corpus, model):
	"""
	Train an HMM segmenter of the given *model* type on *corpus*.

	:param corpus: path to a whitespace-segmented training corpus.
	:param model: an (untrained) HMM instance, e.g. FirstOrderHiddenMarkovModel().
	:return: a HanLP ``Segment`` wrapping the trained segmenter.
	"""
	hmm_segmenter = HMMSegmenter(model)
	hmm_segmenter.train(corpus)
	return hmm_segmenter.toSegment()


def evaluate(segment):
	"""
	Evaluate *segment* on the SIGHAN 2005 MSR test set and print the P/R/F1 result.

	:param segment: a HanLP ``Segment`` instance (e.g. the return value of ``train``).
	"""
	# NOTE(review): the original body referenced CWSEvaluator and
	# msr_test/msr_output/msr_gold/msr_dict, none of which were defined
	# anywhere in this file, so calling evaluate() raised NameError.
	# They are bound here from the standard icwb2-data layout.
	CWSEvaluator = JClass('com.hankcs.hanlp.seg.common.CWSEvaluator')
	msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
	msr_output = os.path.join(test_data_path(), 'msr_hmm_output.txt')
	msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')
	msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
	result = CWSEvaluator.evaluate(segment, msr_test, msr_output, msr_gold, msr_dict)
	print(result)


if __name__ == '__main__':
	# Train a first-order HMM segmenter on the MSR corpus.
	segment = train(msr_train, FirstOrderHiddenMarkovModel())
	HanLP.Config.ShowTermNature = False  # hide part-of-speech tags in the output

	# Classic ambiguity demo: 商品和服务 can be mis-split as 商品/和服/务.
	print(segment.seg('商品和服务'))
	print(segment.seg('商品和货币'))
